### This R script is to make Fig 1. The statistical significance tests are run using weighted regressions.

# open necessary packages
library(tidyverse)
library(Hmisc)
library(dotwhisker)
library(fixest)

# set your directory:
setwd("/Users/olgabaker/Desktop/Replication Package Summer 2025/Data")
data_17 <- read.csv("core_17.csv")
data_18 <- read.csv("core_18.csv")
data_19 <- read.csv("core_19.csv")
data_21 <- read.csv("core_21.csv")
data_22 <- read.csv("core_22.csv")
data_23 <- read.csv("core_23.csv")

data <- list(data_17,data_18,data_19,data_21,data_22,data_23) 
names(data) <- paste("data_",c(17:19,21:23),sep="")

# drop individual data objects
rm(data_17)
rm(data_18)
rm(data_19)
rm(data_21)
rm(data_22)
rm(data_23)

## Make names of teacher variables consistent across years. Rename variables if their variable name changed in a given 
# year. Change it to match the name in the "variable name" column of "Teacher Variables.xlsx"
data$data_18 <- data$data_18 %>% rename(cdis1=cdis)
data$data_19 <- data$data_19 %>% rename(cdis1=cdis)

## import list of teacher variables
varnames <- read.csv("Teacher Variables.csv", na.strings = c(""))
varnames <- varnames$Variable.name
varnames <- varnames[!is.na(varnames)] 

# add simple_avg to varnames
varnames <- c(varnames,"simple_avg")

### 1) Need to calculate weighted average scores for each teacher variable and year.
### Weight by propensity_score*teacher_n for 2017, 2018 and by teacher_n for 2019-2023

# drop schools without propensity scores
data$data_17 <- filter(data$data_17, !is.na(propensity_score)) # dropping 7 schools
data$data_18 <- filter(data$data_18, !is.na(propensity_score)) # dropping 9 schools 

## Label the weights you'll use for each year as "weight"
data$data_17 <- mutate(data$data_17, weight=propensity_score*teacher_n)
data$data_18 <- mutate(data$data_18, weight=propensity_score*teacher_n)
data$data_19 <- mutate(data$data_19, weight=teacher_n)
data$data_21 <- mutate(data$data_21, weight=teacher_n)
data$data_22 <- mutate(data$data_22, weight=teacher_n)
data$data_23 <- mutate(data$data_23, weight=teacher_n)

## Next write a function that calculates weighted average
weighted_avg <- function(x,var){ # be sure to type the variable name with "" surrounding it
  if(!is.null(x[[var]])){
    return(weighted.mean(x[[var]],x[["weight"]], na.rm=T))
  } else{return(NA)}}

# check work:
prti_avg <- sapply(data, function(x) weighted_avg(x, "prti"))

## Apply the function to all variables
# put the calculated percentages in a table
weighted_scores <- matrix(NA, length(varnames), length(data))
row.names(weighted_scores) <- varnames
colnames(weighted_scores) <- names(data)

for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  weighted_scores[i,] <- sapply(data, function(x) weighted_avg(x, var))
}

weighted_scores

### 2) Take differences. Need the following:
# 2017-2023
# 2019-2023
# 2017-2019
# 2019-2021
# 2021-2023
Table <- matrix(NA, length(varnames), 5)
Table <- as.data.frame(Table)
# note that d_ stands for difference:
colnames(Table) <- c("d_17_23", "d_19_23", "d_17_19", "d_19_21", "d_21_23") 
rownames(Table) <- rownames(weighted_scores)

# Fill in columns
weighted_scores <- as.data.frame(weighted_scores)
Table$d_17_23 <- weighted_scores$data_23 - weighted_scores$data_17
Table$d_19_23 <- weighted_scores$data_23 - weighted_scores$data_19
Table$d_17_19 <- weighted_scores$data_19 - weighted_scores$data_17
Table$d_19_21 <- weighted_scores$data_21 - weighted_scores$data_19
Table$d_21_23 <- weighted_scores$data_23 - weighted_scores$data_21

### 3) Run regressions and store std errors. 

# For regression on construct c and difference over 2017-2023,
# a) Stack 2017 and 2023 data
# b) Run regressions of the following form for each variable:

# Y_ict = B0 + B1*T_t + epsilon_ict
# Where i = school, c = construct (survey variable), t = time 
# T is a year dummy (= 1 if data is from 2023)
# run the regressions as weighted (GLS) regressions and cluster by district

# c) store the std error on B1

## write function to run statistical significance test:
# data_1 is the earlier year, data_2 is the later year
wtd.reg <- function(data_1, data_2, var){ # be sure to type variable name with ""
  if(is.null(data_1[[var]]) | is.null(data_2[[var]])){ # clause that gives an NA if the variable is not available in one of the years
    output <- c(NA,NA); names(output) <- c("coefficient","std error")}
  else{
    stack_1 <- data.frame(construct=data_1[[var]], weight=data_1[["weight"]],leaid=data_1[["leaid"]])
    stack_2 <- data.frame(construct=data_2[[var]], weight=data_2[["weight"]],leaid=data_2[["leaid"]])
    stack_1$later_year_dummy <- 0
    stack_2$later_year_dummy <- 1
    stack <- rbind(stack_1, stack_2)
    
    regression <- feols(construct ~ later_year_dummy,  cluster=~leaid, data=stack, weights=~weight)
    coeff_B1 <- coef(regression)["later_year_dummy"]
    std_error_B1 <- se(regression)["later_year_dummy"]
    output <- c(coeff_B1,std_error_B1); names(output) <- c("coefficient","std error")}
  return(output)
}

# check work
test <- wtd.reg(data$data_17, data$data_23, "colb")
diff_in_means <- Table$d_17_23[which(rownames(Table)=="colb")]
test[1]-diff_in_means # should be very small amount

### get std error for each variable and difference - fill in std errors by column 
Table_std_errors <- matrix(NA, nrow(Table), ncol(Table))
Table_std_errors <- as.data.frame(Table_std_errors)
colnames(Table_std_errors) <- colnames(Table)
rownames(Table_std_errors) <- rownames(Table)

# d_17_23
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_std_errors$d_17_23[i] <- wtd.reg(data$data_17, data$data_23, var)[2]
}

# d_19_23
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_std_errors$d_19_23[i] <- wtd.reg(data$data_19, data$data_23, var)[2]
}

# d_17_19
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_std_errors$d_17_19[i] <- wtd.reg(data$data_17, data$data_19, var)[2]
}

# d_19_21
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_std_errors$d_19_21[i] <- wtd.reg(data$data_19, data$data_21, var)[2]
}

# d_21_23
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_std_errors$d_21_23[i] <- wtd.reg(data$data_21, data$data_23, var)[2]
}
Table_std_errors

# check work
Table_std_errors<0

### Divide by 20 and drop 'data' variable
Table <- Table/20
Table_std_errors <- Table_std_errors/20
Table <- Table[-which(rownames(Table)=='data'),]
Table_std_errors <- Table_std_errors[-which(rownames(Table_std_errors)=='data'),]

### 4) Make a dot and whisker plot

# need columns 'term', 'estimate', 'std.error' and 'model'
varnames <- read.csv("Teacher Variables.csv", na.strings = c(""))
varnames <- varnames$Variable.definition
varnames <- varnames[!is.na(varnames)]
varnames <- varnames[-which(varnames=="Collective Use of Assessment Data")]
varnames <- c(varnames, "Simple Average")

# combine many timespans
plot.data.19.23 <- data.frame(term=varnames,estimate=Table$d_19_23,
                              std.error=Table_std_errors$d_19_23,model="2019-2023")
plot.data.17.19 <- data.frame(term=varnames,estimate=Table$d_17_19,
                              std.error=Table_std_errors$d_17_19,model="2017-2019")
plot.data.19.21 <- data.frame(term=varnames,estimate=Table$d_19_21,
                              std.error=Table_std_errors$d_19_21,model="2019-2021")
plot.data.21.23 <- data.frame(term=varnames,estimate=Table$d_21_23,
                              std.error=Table_std_errors$d_21_23,model="2021-2023")

plot.data <- rbind(plot.data.19.23,plot.data.17.19,plot.data.19.21,plot.data.21.23)

dwplot(plot.data,vline = geom_vline(xintercept = 0, colour = "grey50", 
                                    linetype = 2)) +
  facet_wrap(~factor(model, levels=c("2019-2023","2017-2019","2019-2021","2021-2023")),
             ncol=4)+theme(legend.position="none") 

##### color code dots
plot.data.backup <- plot.data
plot.data$color <- ifelse(plot.data$estimate>=0,"positive","negative")

dwplot(plot.data, dot_args = list(aes(color=color)), vline = geom_vline(xintercept = 0, colour = "grey50", 
                                    linetype = 2)) +
  facet_wrap(~factor(model, levels=c("2019-2023","2017-2019","2019-2021","2021-2023")),
             ncol=4)+theme(legend.position="none") 

dwplot(plot.data, dot_args = list(aes(color=color)), whisker_args = list(aes(color=color)),
       vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2)) +
  facet_wrap(~factor(model, levels=c("2019-2023","2017-2019","2019-2021","2021-2023")),
             ncol=4)+theme(legend.position="none") 

dwplot(plot.data, dot_args = list(aes(color=color)), whisker_args = list(aes(color=color)),
       vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2)) +
  scale_color_manual(values=c("positive"="green","negative"="red"))+
  facet_wrap(~factor(model, levels=c("2019-2023","2017-2019","2019-2021","2021-2023")),
             ncol=4)+theme(legend.position="none") 

# Darken the green
dwplot(plot.data, dot_args = list(aes(color=color)), whisker_args = list(aes(color=color)),
       vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2)) +
  scale_color_manual(values=c("positive"="green4","negative"="red"))+
  facet_wrap(~factor(model, levels=c("2019-2023","2017-2019","2019-2021","2021-2023")),
             ncol=4)+theme(legend.position="none") #>> export as 1300x700

